import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objs as go
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import roc_curve, auc
# DATA Pre-Processing
# Read the raw UCI "drug consumption" dataset (comma-separated, no header row).
with open('/content/sample_data/drug_consumption.data', 'r') as file:
    data = file.read()
# Naming all the columns with their associated names
# NOTE(review): several names look misspelled ('Canabis', 'Impulsivness',
# 'Sensation Seeing') but they are used consistently throughout this script,
# so renaming them here would break every later reference.
column_name = ['ID','Age','Sex','Education','Country','Ethnicity','Neuroticism','Extraversion','Openness','Agreeableness','Conscientiousness','Impulsivness','Sensation Seeing','Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']
df = pd.read_csv(StringIO(data), header=None)
df.columns = column_name
# Keep an untouched numeric copy; it is reused for the modelling work below.
df2=df.copy(deep=True)
### Mapping the columns with the values associated with the given data.
# Age and Sex now use replace() dictionaries, consistent with the
# Education/Country/Ethnicity mappings below (the original used repeated
# df.loc[...] comparisons for these two columns only).
age_mapping = {
    -0.95197: "18-24",
    -0.07854: "25-34",
    0.49788: "35-44",
    1.09449: "45-54",
    1.82213: "55-64",
    2.59171: "65+",
}
df['Age'] = df['Age'].replace(age_mapping)
sex_mapping = {
    -0.48246: "Male",
    0.48246: "Female",
}
df['Sex'] = df['Sex'].replace(sex_mapping)
education_mapping = {
    -2.43591: 'Left school before 16 years',
    -1.73790: 'Left school at 16 years',
    -1.43719: 'Left school at 17 years',
    -1.22751: 'Left school at 18 years',
    -0.61113: 'Some college or university, no certificate or degree',
    -0.05921: 'Professional certificate/ diploma',
    0.45468: 'University degree',
    1.16365: 'Masters degree',
    1.98437: 'Doctorate degree'
}
df['Education'] = df['Education'].replace(education_mapping)
country_mapping = {
    -0.09765: 'Australia',
    0.24923: 'Canada',
    -0.46841: 'New Zealand',
    -0.28519: 'Other',
    0.21128: 'Republic of Ireland',
    0.96082: 'UK',
    -0.57009: 'USA',
}
df['Country'] = df['Country'].replace(country_mapping)
ethnicity_mapping = {
    -0.50212: 'Asian',
    -1.10702: 'Black',
    1.90725: 'Mixed-Black/Asian',
    0.12600: 'Mixed-White/Asian',
    -0.22166: 'Mixed-White/Black',
    0.11440: 'Other',
    -0.31685: 'White',
}
df['Ethnicity'] = df['Ethnicity'].replace(ethnicity_mapping)
# CL0..CL6 usage-recency codes -> human-readable labels, for every drug column.
drugs_mapping = {
    'CL0': 'Never Used',
    'CL1': 'Used over a Decade Ago',
    'CL2': 'Used in Last Decade',
    'CL3': 'Used in Last Year',
    'CL4': 'Used in Last Month',
    'CL5': 'Used in Last Week',
    'CL6': 'Used in Last Day',
}
columns_to_categorize = ['Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']
for col in columns_to_categorize:
    df[col] = df[col].replace(drugs_mapping)
# Personality-test scores are bucketed into Low / Average / High:
# High when score > mean+std, Low when score < mean-std, Average otherwise.
columns_to_categorize_s = ['Neuroticism', 'Extraversion', 'Openness', 'Agreeableness', 'Conscientiousness','Impulsivness','Sensation Seeing']
# Apply the categorization to each specified column
for col in columns_to_categorize_s:
    bins = [-float("inf"), df[col].mean() - df[col].std(), df[col].mean() + df[col].std(), float("inf")]
    labels = ['Low '+col, 'Average '+col, 'High '+col]
    df[col] = pd.cut(df[col], bins=bins, labels=labels)
df = df.set_index('ID')
# Snapshot of the fully labelled data, reused for the final per-drug models.
df_standard = df.copy(deep=True)
df_standard.to_csv('df_standard.csv', index=False)
# Inspect the distinct values of the key categorical columns as a sanity
# check on the mappings applied above.
columns_to_check = ['Age', 'Sex', 'Education', 'Country', 'Ethnicity', 'Neuroticism', 'Extraversion', 'Openness', 'Agreeableness', 'Conscientiousness', 'Alcohol', 'Caffeine', 'Nicotine']
unique_values = {col: df[col].unique() for col in columns_to_check}
unique_values
{'Age': array(['35-44', '25-34', '18-24', '65+', '45-54', '55-64'], dtype=object),
'Sex': array(['Female', 'Male'], dtype=object),
'Education': array(['Professional certificate/ diploma', 'Doctorate degree',
'Masters degree', 'Left school at 18 years',
'Left school at 16 years', 'University degree',
'Some college or university, no certificate or degree',
'Left school before 16 years', 'Left school at 17 years'],
dtype=object),
'Country': array(['UK', 'Canada', 'USA', 'Other', 'Australia', 'Republic of Ireland',
'New Zealand'], dtype=object),
'Ethnicity': array(['Mixed-White/Asian', 'White', 'Other', 'Mixed-White/Black',
'Asian', 'Black', 'Mixed-Black/Asian'], dtype=object),
'Neuroticism': ['Average Neuroticism', 'Low Neuroticism', 'High Neuroticism']
Categories (3, object): ['Low Neuroticism' < 'Average Neuroticism' < 'High Neuroticism'],
'Extraversion': ['Average Extraversion', 'High Extraversion', 'Low Extraversion']
Categories (3, object): ['Low Extraversion' < 'Average Extraversion' < 'High Extraversion'],
'Openness': ['Average Openness', 'High Openness', 'Low Openness']
Categories (3, object): ['Low Openness' < 'Average Openness' < 'High Openness'],
'Agreeableness': ['Average Agreeableness', 'Low Agreeableness', 'High Agreeableness']
Categories (3, object): ['Low Agreeableness' < 'Average Agreeableness' < 'High Agreeableness'],
'Conscientiousness': ['Average Conscientiousness', 'Low Conscientiousness', 'High Conscientiousness']
Categories (3, object): ['Low Conscientiousness' < 'Average Conscientiousness' < 'High Conscientiousness'],
'Alcohol': array(['Used in Last Week', 'Used in Last Day', 'Used in Last Month',
'Used in Last Decade', 'Used over a Decade Ago', 'Never Used',
'Used in Last Year'], dtype=object),
'Caffeine': array(['Used in Last Day', 'Used in Last Week', 'Used in Last Month',
'Used in Last Year', 'Never Used', 'Used over a Decade Ago',
'Used in Last Decade'], dtype=object),
'Nicotine': array(['Used in Last Decade', 'Used in Last Month', 'Never Used',
'Used in Last Day', 'Used over a Decade Ago', 'Used in Last Year',
'Used in Last Week'], dtype=object)}
# Country distribution: pie + bar chart, labels annotated with raw counts.
# (Fixes the misspelled chart title "Pourcentage".)
df1 = df.copy(deep=True)
counts_by_country = df1['Country'].value_counts()
df1['CountryWithCount'] = df1['Country'].apply(lambda x: f"{x} ({counts_by_country[x]})")
fig = px.pie(df1, names="CountryWithCount", color="CountryWithCount", title="Percentage of people surveyed by country")
fig.show()
bar_fig = px.histogram(df1, x="CountryWithCount", color="CountryWithCount", title="Number of people surveyed by country")
bar_fig.update_layout(xaxis_title="Country", yaxis_title="Count")
bar_fig.show()
In this graph we see that people from New Zealand represent 0.265% of the survey respondents and people from the Republic of Ireland represent 1.06% of the answers. In order to have a more efficient model, we set those people's nationalities to "Other".
# Age distribution: pie + bar chart, labels annotated with raw counts.
# (Fixes the copy-pasted variable name 'counts_by_country', which actually
# held Age counts here, and the misspelled title "Pourcentage".)
df1 = df.copy(deep=True)
counts_by_age = df1['Age'].value_counts()
df1['AgeWithCount'] = df1['Age'].apply(lambda x: f"{x} ({counts_by_age[x]})")
fig = px.pie(df1, names="AgeWithCount", color="AgeWithCount", title="Percentage of people surveyed by Age")
fig.show()
fig = px.histogram(df1, x="AgeWithCount", color="AgeWithCount", title="Number of people surveyed by Age")
fig.update_layout(xaxis_title="Age", yaxis_title="Count")
fig.show()
In this graph we see that people aged 65 or over represent less than 1% of the survey respondents. In order to have a more efficient model, we create a new category "55+" that combines the "55-64" and "65+" groups.
# Education distribution: pie + bar chart, labels annotated with raw counts.
# (Fixes the copy-pasted variable name 'counts_by_country', which actually
# held Education counts here, and the misspelled title "Pourcentage".)
df1 = df.copy(deep=True)
counts_by_education = df1['Education'].value_counts()
df1['EducationWithCount'] = df1['Education'].apply(lambda x: f"{x} ({counts_by_education[x]})")
fig = px.pie(df1, names="EducationWithCount", color="EducationWithCount", title="Percentage of people surveyed by Education")
fig.show()
fig = px.histogram(df1, x="EducationWithCount", color="EducationWithCount", title="Number of people surveyed by Education")
fig.update_layout(xaxis_title="Education", yaxis_title="Count")
fig.show()
We create a new category "Left school before 18 years" that combines "Left school at 17 years", "Left school at 16 years" and "Left school before 16 years".
# Binarize each drug column into 'User' / 'Non-user' and plot the counts.
df_user = df.copy(deep=True)
drug_using = ['User_Alcohol','User_Amphet', 'User_Amyl', 'User_Benzos', 'User_Caff', 'User_Cannabis', 'User_Choc', 'User_Coke', 'User_Crack','User_Ecstasy', 'User_Heroin', 'User_Ketamine', 'User_Legalh', 'User_LSD', 'User_Meth', 'User_Mushrooms','User_Nicotine', 'User_Semer', 'User_VSA']
# 'Non-user' = never used or not in the last decade; 'User' = everything else.
non_user_labels = ("Never Used", "Used over a Decade Ago")
user_labels = ("Used in Last Year", "Used in Last Decade", "Used in Last Day", "Used in Last Week", "Used in Last Month")
for i in range(len(columns_to_categorize)):
    col = columns_to_categorize[i]
    df_user.loc[df_user[col].isin(non_user_labels), drug_using[i]] = 'Non-user'
    df_user.loc[df_user[col].isin(user_labels), drug_using[i]] = 'User'
count_of_users = []
count_of_non_users = []
for i in range(len(columns_to_categorize)):
    s = df_user.groupby([drug_using[i]])[columns_to_categorize[i]].count()
    # Index by label, not position: the original s[1]/s[0] relied on the
    # deprecated positional fallback for a string-indexed Series, which
    # raises KeyError on modern pandas. .get() also tolerates a missing
    # category (e.g. a drug with no users at all).
    count_of_users.append(s.get('User', 0))
    count_of_non_users.append(s.get('Non-user', 0))
trace1 = go.Bar(
    x=columns_to_categorize,
    y=count_of_users,
    name='User',
    marker = dict(color="rgba(220, 20, 60, 0.7)")
)
trace2 = go.Bar(
    x=columns_to_categorize,
    y=count_of_non_users,
    name='Non-User',
    marker = dict(color="rgba(0, 128, 0, 0.7)")
)
data = [trace1, trace2]
layout = go.Layout(
    title= 'Drug Vs User Or Non-user',
    yaxis=dict(title='Count', ticklen=5, gridwidth=2),
    barmode='group'
)
fig = go.Figure(data=data, layout=layout)
fig.show()
# Ordinal copy of the labelled data: map each usage label back to its
# 0-6 recency code so the drug columns are numeric again.
df4 = df.copy(deep=True)
drugs_mapping = {
    'Never Used': 0,
    'Used over a Decade Ago': 1,
    'Used in Last Decade': 2,
    'Used in Last Year': 3,
    'Used in Last Month': 4,
    'Used in Last Week': 5,
    'Used in Last Day': 6,
}
columns_to_categorize = ['Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']
df4[columns_to_categorize] = df4[columns_to_categorize].replace(drugs_mapping)
df4.head()
| Age | Sex | Education | Country | Ethnicity | Neuroticism | Extraversion | Openness | Agreeableness | Conscientiousness | ... | Ecstasy | Heroin | Ketamine | Legal Highs | LSD | Methadone | Mushrooms | Nicotine | Semer | VSA | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||||||||
| 1 | 35-44 | Female | Professional certificate/ diploma | UK | Mixed-White/Asian | Average Neuroticism | Average Extraversion | Average Openness | Average Agreeableness | Average Conscientiousness | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 |
| 2 | 25-34 | Male | Doctorate degree | UK | White | Average Neuroticism | High Extraversion | High Openness | Average Agreeableness | Average Conscientiousness | ... | 4 | 0 | 2 | 0 | 2 | 3 | 0 | 4 | 0 | 0 |
| 3 | 35-44 | Male | Professional certificate/ diploma | UK | White | Average Neuroticism | Average Extraversion | Average Openness | Low Agreeableness | Low Conscientiousness | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 18-24 | Female | Masters degree | UK | White | Average Neuroticism | Average Extraversion | Average Openness | Average Agreeableness | Average Conscientiousness | ... | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 0 |
| 5 | 35-44 | Female | Doctorate degree | UK | White | Average Neuroticism | Low Extraversion | Average Openness | Average Agreeableness | High Conscientiousness | ... | 1 | 0 | 0 | 1 | 0 | 0 | 2 | 2 | 0 | 0 |
5 rows × 31 columns
# Merge the sparse categories decided on during the EDA above, in one pass:
# rare countries -> 'Other', the two oldest age bands -> '55+', and every
# pre-18 school-leaver category -> a single bucket. The key sets are
# disjoint, so one combined replace is equivalent to the three originals.
df4.replace({
    'New Zealand': 'Other',
    'Republic of Ireland': 'Other',
    '55-64': '55+',
    '65+': '55+',
    "Left school at 17 years": "Left school before 18 years",
    "Left school at 16 years": "Left school before 18 years",
    "Left school before 16 years": "Left school before 18 years",
}, inplace=True)
# df2 still holds the raw data; convert its CL0..CL6 drug codes to ints too.
drugs_mapping = {f'CL{level}': level for level in range(7)}
columns_to_categorize = ['Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']
for col in columns_to_categorize:
    df2[col] = df2[col].replace(drugs_mapping)
df2.head()
| ID | Age | Sex | Education | Country | Ethnicity | Neuroticism | Extraversion | Openness | Agreeableness | ... | Ecstasy | Heroin | Ketamine | Legal Highs | LSD | Methadone | Mushrooms | Nicotine | Semer | VSA | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.49788 | 0.48246 | -0.05921 | 0.96082 | 0.12600 | 0.31287 | -0.57545 | -0.58331 | -0.91699 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 |
| 1 | 2 | -0.07854 | -0.48246 | 1.98437 | 0.96082 | -0.31685 | -0.67825 | 1.93886 | 1.43533 | 0.76096 | ... | 4 | 0 | 2 | 0 | 2 | 3 | 0 | 4 | 0 | 0 |
| 2 | 3 | 0.49788 | -0.48246 | -0.05921 | 0.96082 | -0.31685 | -0.46725 | 0.80523 | -0.84732 | -1.62090 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 4 | -0.95197 | 0.48246 | 1.16365 | 0.96082 | -0.31685 | -0.14882 | -0.80615 | -0.01928 | 0.59042 | ... | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 0 |
| 4 | 5 | 0.49788 | 0.48246 | 1.98437 | 0.96082 | -0.31685 | 0.73545 | -1.63340 | -0.45174 | -0.30172 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 2 | 2 | 0 | 0 |
5 rows × 32 columns
# Merge the sparse categories on the numeric copy as well. Plain assignment
# replaces the original chained `Series.replace(..., inplace=True)`, which
# is a chained-assignment pattern that silently stops working (or raises)
# under pandas copy-on-write.
df2['Country'] = df2['Country'].replace({-0.46841: -0.28519, 0.21128: -0.28519})
df2['Age'] = df2['Age'].replace({2.59171: 1.82213})
df2['Education'] = df2['Education'].replace({-2.43591: -1.43719, -1.73790: -1.43719})
# Labelled (human-readable) copy with the same sparse-category merges
# applied as on the numeric frames above.
df3=df.copy(deep=True)
df3.replace({'New Zealand': 'Other', 'Republic of Ireland':'Other'}, inplace=True)
df3['Country'].unique()
# (notebook output of the line above:)
array(['UK', 'Canada', 'USA', 'Other', 'Australia'], dtype=object)
df3.replace({'55-64':'55+','65+':'55+'},inplace=True)
df3['Age'].unique()
# (notebook output of the line above:)
array(['35-44', '25-34', '18-24', '55+', '45-54'], dtype=object)
# Collapse all pre-18 school-leaver categories into a single label.
df3.replace({"Left school at 17 years":"Left school before 18 years", "Left school at 16 years":"Left school before 18 years","Left school before 16 years":"Left school before 18 years"}, inplace=True)
# Rebuild a fully numeric frame from the raw file for the correlation heatmap.
column_name = ['ID','Age','Sex','Education','Country','Ethnicity','Neuroticism','Extraversion','Openness','Agreeableness','Conscientiousness','Impulsivness','Sensation Seeing','Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']
with open('/content/sample_data/drug_consumption.data', 'r') as file:
    data = file.read()
df_for_heat = pd.read_csv(StringIO(data), header=None)
df_for_heat.columns = column_name
# CL0..CL6 usage codes -> ordinal 0-6.
mapping = {
    "CL0": 0,
    "CL1": 1,
    "CL2": 2,
    "CL3": 3,
    "CL4": 4,
    "CL5": 5,
    "CL6": 6
}
# Same sparse-category merges as above, expressed on the numeric codes.
country_mapping_heat = {
    -0.46841: -0.28519,
    0.21128: -0.28519,
}
education_mapping_heat = {
    -2.43591: -1.43719,
    -1.73790: -1.43719,
}
age_mapping_heat = {
    2.59171: 1.82213
}
df_for_heat = df_for_heat.replace(mapping)
df_for_heat['Country'] = df_for_heat['Country'].replace(country_mapping_heat)
df_for_heat['Education'] = df_for_heat['Education'].replace(education_mapping_heat)
df_for_heat['Age'] = df_for_heat['Age'].replace(age_mapping_heat)
df_for_heat = df_for_heat.drop("ID",axis=1)
# A single reset is enough: the original reset the index twice in a row,
# and the freshly read frame already has a default 0..n-1 index anyway.
df_for_heat_reset = df_for_heat.reset_index(drop=True)
df_for_heat_corr = df_for_heat_reset.corr()
plt.figure(figsize=(20, 20))
fig = sns.heatmap(df_for_heat_corr, cmap='coolwarm', annot=True)
plt.show()
Standardizing the heatmap by drug
# Z-score each drug's correlation column (population std, ddof=0) so the
# per-drug quantile thresholds used below are comparable across drugs.
for drug in columns_to_categorize:
    column = df_for_heat_corr[drug]
    df_for_heat_corr[drug] = (column - column.mean()) / column.std(ddof=0)
Creating the dictionary that holds the relevant (most strongly correlated) features for each drug
# For each drug, keep only the rows of the standardized correlation matrix
# whose value for that drug falls outside the 40th-60th percentile band
# (i.e. the most strongly correlated features, positive or negative).
# The other drug columns and the demographic/personality columns are then
# dropped, leaving a one-column frame whose INDEX is the selected feature
# list — that index is what the prediction functions below use as X.
dfs={}
for drug in columns_to_categorize:
    drugs=columns_to_categorize[:]
    drugs.remove(drug)  # every drug column except the current target
    inter_down=df_for_heat_corr[drug].quantile(0.4)
    inter_up=df_for_heat_corr[drug].quantile(0.6)
    temp_df=df_for_heat_corr.copy(deep=True)
    temp_df=temp_df[(temp_df[drug]>inter_up)|(temp_df[drug]<inter_down)]
    temp_df.drop(columns=drugs,axis=1,inplace=True)
    temp_df.drop(columns=['Age','Sex','Education','Country','Ethnicity','Neuroticism', 'Extraversion', 'Openness', 'Agreeableness','Conscientiousness','Impulsivness','Sensation Seeing'],axis=1,inplace=True)
    dfs[f"df_{drug}"]=temp_df
# Quick inspection of how many features were kept per drug.
for drug in columns_to_categorize:
    print(drug,len(dfs[f'df_{drug}']), dfs[f'df_{drug}'])
Alcohol 24 Alcohol Age -0.493926 Sex -0.370167 Education 0.364151 Country 0.086219 Ethnicity -0.053963 Neuroticism -0.366489 Extraversion 0.161909 Agreeableness -0.478270 Conscientiousness -0.357074 Sensation Seeing 0.252624 Alcohol 5.254320 Amphetamines -0.411966 Amyl Nitrite 0.130753 Benzodiazepine -0.413322 Caffeine 0.356214 Cocaine 0.146659 Crack -0.468818 Ecstasy 0.056900 Heroin -0.545480 Ketamine 0.013752 LSD -0.287919 Methadone -0.799265 Nicotine -0.006619 Semer -0.558521 Amphetamines 24 Amphetamines Age -1.501856 Sex -1.424431 Education -1.236616 Country -2.010839 Ethnicity -0.398753 Extraversion -0.818260 Agreeableness -1.178505 Conscientiousness -1.470542 Alcohol -0.714279 Amphetamines 2.661744 Benzodiazepine 0.992571 Caffeine -0.486889 Canabis 0.851960 Chocolate -0.882213 Cocaine 1.093883 Ecstasy 1.062414 Heroin 0.525452 Ketamine 0.567873 Legal Highs 0.906835 LSD 0.721730 Methadone 0.648377 Mushrooms 0.749677 Nicotine 0.496546 Semer -0.581816 Amyl Nitrite 24 Amyl Nitrite Age -1.148064 Sex -1.467539 Education -0.690678 Ethnicity -0.501307 Neuroticism -0.527611 Extraversion -0.542598 Openness -0.396912 Agreeableness -1.149406 Conscientiousness -1.253918 Sensation Seeing 0.250811 Amphetamines 0.831473 Amyl Nitrite 4.118429 Benzodiazepine 0.356920 Canabis 0.370838 Chocolate -0.678906 Cocaine 1.125521 Ecstasy 1.037419 Ketamine 0.964293 Legal Highs 0.562994 LSD 0.088099 Methadone -0.413390 Mushrooms 0.331973 Nicotine 0.385331 Semer -0.656507 Benzodiazepine 24 Benzodiazepine Age -1.149679 Sex -1.171353 Education -1.192893 Country -2.137862 Ethnicity -0.254375 Extraversion -1.061291 Agreeableness -1.283306 Conscientiousness -1.435523 Alcohol -0.722422 Amphetamines 1.134811 Benzodiazepine 2.950190 Caffeine -0.483413 Canabis 0.618258 Chocolate -0.874508 Cocaine 0.925654 Crack 0.578949 Ecstasy 0.559536 Heroin 0.867682 Legal Highs 0.614452 LSD 0.415335 Methadone 1.202613 Mushrooms 0.566929 Nicotine 0.418026 Semer -0.590376 Caffeine 24 Caffeine Sex -0.465533 Country 
-0.385860 Ethnicity 0.082401 Neuroticism -0.326510 Extraversion -0.089456 Agreeableness -0.494194 Conscientiousness -0.560497 Sensation Seeing -0.102616 Alcohol 0.327112 Amphetamines -0.068243 Amyl Nitrite 0.057064 Benzodiazepine -0.082728 Caffeine 5.337020 Chocolate 0.260328 Cocaine 0.039226 Crack -0.317795 Heroin -0.317034 Ketamine -0.320970 Legal Highs -0.362575 LSD -0.427172 Methadone -0.268187 Nicotine 0.325574 Semer -0.464896 VSA -0.072721 Canabis 24 Canabis Age -1.885530 Sex -1.457895 Education -1.401661 Country -2.198102 Neuroticism -0.287616 Extraversion -0.611500 Openness 0.651365 Agreeableness -1.006721 Conscientiousness -1.378918 Impulsivness 0.345961 Sensation Seeing 0.775061 Alcohol -0.501504 Amphetamines 0.782250 Benzodiazepine 0.487486 Caffeine -0.449722 Canabis 2.377802 Chocolate -0.758653 Cocaine 0.753369 Ecstasy 1.060449 Legal Highs 1.063601 LSD 0.966351 Mushrooms 1.139886 Nicotine 0.948955 Semer -0.439579 Chocolate 24 Chocolate Age 0.172826 Sex 0.299880 Education 0.081048 Country 0.558245 Ethnicity 0.033495 Neuroticism -0.029723 Extraversion 0.011328 Agreeableness 0.096180 Alcohol 0.147315 Amphetamines -0.416793 Amyl Nitrite -0.086846 Benzodiazepine -0.373392 Caffeine 0.516325 Canabis -0.438459 Chocolate 5.219498 Cocaine -0.427750 Crack -0.724267 Ecstasy -0.357804 Heroin -0.512602 Legal Highs -0.392684 LSD -0.512451 Methadone -0.332792 Mushrooms -0.489921 VSA -0.489911 Cocaine 24 Cocaine Age -1.546011 Sex -1.396869 Education -1.182455 Country -1.752817 Ethnicity -0.477363 Extraversion -0.672428 Agreeableness -1.471679 Conscientiousness -1.461494 Alcohol -0.467833 Amphetamines 1.072016 Benzodiazepine 0.765551 Caffeine -0.512549 Canabis 0.785323 Chocolate -0.997693 Cocaine 2.708446 Crack 0.581669 Ecstasy 1.350536 Heroin 0.699633 Ketamine 0.760482 Legal Highs 0.660420 LSD 0.559424 Mushrooms 0.707899 Nicotine 0.619524 Semer -0.618320 Crack 24 Crack Age -0.822680 Sex -1.247222 Education -1.229910 Country -1.568633 Ethnicity -0.469025 Extraversion 
-0.829153 Agreeableness -1.049848 Conscientiousness -1.172082 Alcohol -0.700647 Amphetamines 0.609413 Benzodiazepine 0.831823 Caffeine -0.556208 Canabis 0.360121 Chocolate -1.108672 Cocaine 1.009550 Crack 3.549062 Ecstasy 0.351305 Heroin 1.579097 LSD 0.342457 Methadone 0.866032 Mushrooms 0.472991 Nicotine 0.387227 Semer -0.506687 VSA 0.425834 Ecstasy 24 Ecstasy Age -1.925570 Sex -1.422561 Education -1.253586 Country -1.774366 Ethnicity -0.465697 Neuroticism -0.471680 Agreeableness -1.059359 Conscientiousness -1.386760 Sensation Seeing 0.542005 Alcohol -0.460093 Amphetamines 0.966766 Amyl Nitrite 0.448868 Benzodiazepine 0.396163 Caffeine -0.586237 Canabis 1.066917 Chocolate -0.850983 Cocaine 1.251042 Ecstasy 2.490815 Ketamine 0.924695 Legal Highs 1.070598 LSD 1.124148 Mushrooms 1.049582 Nicotine 0.517200 Semer -0.569879 Heroin 24 Heroin Age -1.083237 Sex -1.143190 Education -1.137117 Country -1.769579 Ethnicity -0.450206 Extraversion -0.924309 Agreeableness -1.271122 Conscientiousness -1.226796 Alcohol -0.746008 Amphetamines 0.776787 Benzodiazepine 1.032459 Caffeine -0.559001 Chocolate -0.917563 Cocaine 1.021088 Crack 1.418124 Ecstasy 0.409694 Heroin 3.242629 Ketamine 0.347425 Legal Highs 0.318841 LSD 0.462904 Methadone 1.229626 Mushrooms 0.419519 Semer -0.550219 VSA 0.372350 Ketamine 24 Ketamine Age -1.609272 Sex -1.482609 Education -1.071489 Country -1.168711 Ethnicity -0.593560 Neuroticism -0.470141 Extraversion -0.646612 Agreeableness -1.165683 Conscientiousness -1.336923 Amphetamines 0.775874 Amyl Nitrite 0.656195 Benzodiazepine 0.468940 Caffeine -0.665570 Canabis 0.517548 Chocolate -0.826410 Cocaine 1.048734 Ecstasy 1.315998 Heroin 0.278915 Ketamine 3.286900 Legal Highs 0.925252 LSD 0.921570 Mushrooms 0.922074 Nicotine 0.304104 Semer -0.483569 Legal Highs 24 Legal Highs Age -1.905834 Sex -1.570871 Education -1.305606 Country -1.918121 Ethnicity -0.342330 Extraversion -0.701326 Agreeableness -1.021906 Conscientiousness -1.379464 Sensation Seeing 0.682741 
Alcohol -0.497569 Amphetamines 0.899633 Benzodiazepine 0.532537 Caffeine -0.563437 Canabis 1.146647 Chocolate -0.758534 Cocaine 0.705855 Ecstasy 1.146916 Ketamine 0.699219 Legal Highs 2.540059 LSD 0.890100 Methadone 0.430418 Mushrooms 1.077191 Nicotine 0.481358 Semer -0.534542 LSD 24 LSD Age -1.655357 Sex -1.508198 Education -1.228797 Country -2.206700 Neuroticism -0.490216 Extraversion -0.551179 Openness 0.581186 Agreeableness -0.912066 Conscientiousness -1.127243 Sensation Seeing 0.567584 Alcohol -0.570640 Amphetamines 0.741758 Benzodiazepine 0.365323 Caffeine -0.624211 Canabis 1.068437 Chocolate -0.861611 Cocaine 0.627138 Ecstasy 1.229141 Ketamine 0.710574 Legal Highs 0.910282 LSD 2.610982 Mushrooms 1.543741 Nicotine 0.331508 Semer -0.380206 Methadone 24 Methadone Age -1.240293 Sex -1.200067 Education -1.189178 Country -2.044143 Extraversion -0.980958 Agreeableness -1.109752 Conscientiousness -1.236320 Alcohol -0.824506 Amphetamines 0.922662 Amyl Nitrite -0.325659 Benzodiazepine 1.368426 Caffeine -0.449860 Canabis 0.546429 Chocolate -0.697708 Cocaine 0.725218 Crack 0.769754 Ecstasy 0.415054 Heroin 1.218051 Legal Highs 0.655662 LSD 0.442967 Methadone 3.130309 Mushrooms 0.481325 Semer -0.548798 VSA 0.413763 Mushrooms 24 Mushrooms Age -1.690986 Sex -1.501169 Education -1.232011 Country -2.179068 Neuroticism -0.502639 Extraversion -0.570137 Openness 0.533751 Agreeableness -0.990490 Conscientiousness -1.243640 Sensation Seeing 0.562760 Alcohol -0.550110 Amphetamines 0.720370 Benzodiazepine 0.455391 Caffeine -0.510167 Canabis 1.202346 Chocolate -0.871738 Cocaine 0.715953 Ecstasy 1.099587 Ketamine 0.663539 Legal Highs 1.049736 LSD 1.483662 Mushrooms 2.534704 Nicotine 0.387513 Semer -0.322692 Nicotine 24 Nicotine Age -1.585401 Sex -1.364454 Education -1.575166 Country -1.709497 Ethnicity -0.334410 Extraversion -0.704420 Agreeableness -1.055187 Conscientiousness -1.512497 Sensation Seeing 0.535685 Alcohol -0.393432 Amphetamines 0.713644 Benzodiazepine 0.527440 Caffeine 
-0.147552 Canabis 1.335532 Chocolate -0.785497 Cocaine 0.900728 Ecstasy 0.821113 Ketamine 0.345802 Legal Highs 0.671182 LSD 0.484531 Mushrooms 0.602109 Nicotine 3.186563 Semer -0.528976 VSA 0.342464 Semer 24 Semer Age -0.554092 Sex -0.195398 Education -0.487993 Country -0.650448 Neuroticism -0.280097 Openness -0.119749 Conscientiousness -0.222620 Impulsivness -0.207664 Sensation Seeing -0.000184 Alcohol -0.474215 Amphetamines -0.103591 Amyl Nitrite -0.234040 Caffeine -0.333149 Canabis -0.022826 Chocolate -0.493377 Cocaine -0.008836 Ecstasy -0.050164 Ketamine 0.064158 LSD 0.130962 Methadone -0.292079 Mushrooms 0.288047 Nicotine -0.119830 Semer 5.366112 VSA 0.045397 VSA 24 VSA Age -1.595559 Sex -1.183045 Education -1.163495 Country -1.768690 Ethnicity -0.228411 Extraversion -0.744407 Agreeableness -1.093681 Conscientiousness -1.288853 Sensation Seeing 0.475482 Alcohol -0.493714 Amphetamines 0.555772 Benzodiazepine 0.576475 Caffeine -0.356427 Canabis 0.564016 Chocolate -0.921132 Cocaine 0.610012 Ecstasy 0.479539 Heroin 0.499035 Legal Highs 0.772545 LSD 0.628617 Methadone 0.510873 Nicotine 0.494514 Semer -0.361534 VSA 3.700016
def prediction(modelo, drug, params=None):
    """Train `modelo` on the features selected for `drug` and evaluate it.

    Uses the feature list stored as the index of dfs[f'df_{drug}'] to pick
    columns from the global df2, holds out 20% of the rows, and returns the
    tuple (drug, classification_report string) for the held-out set.
    """
    classifier = modelo(**(params or {}))
    features = df2[dfs[f'df_{drug}'].index]
    target = df2[drug]
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, test_size=0.2, random_state=37
    )
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(X_test)
    report = classification_report(Y_test, predictions)
    return drug, report
# Baseline (default-hyperparameter) reports for all three model families.
rff = RandomForestClassifier
knn = KNeighborsClassifier
svc = SVC
class_rep_rff = {}
class_rep_knn = {}
class_rep_svc = {}
for drug in columns_to_categorize:
    # Same evaluation order per drug as before: RF, then SVC, then KNN.
    for model_cls, reports in ((rff, class_rep_rff), (svc, class_rep_svc), (knn, class_rep_knn)):
        name, report = prediction(model_cls, drug)
        reports[name] = report
def compare_reports(cr1, cr2):
    """Print a side-by-side comparison of two sklearn classification reports.

    Compares the weighted-average precision, recall and f1-score of the two
    report strings and prints each metric with its difference (model 1 minus
    model 2). The original version read the 'weighted avg' row at the
    hard-coded split index 12, which only holds for 7-class reports; the row
    is now located by its label so any number of classes works.
    """
    def _weighted_avg(report):
        # Return [precision, recall, f1] from the report's 'weighted avg' row.
        for line in report.split('\n'):
            parts = line.split()
            if parts[:2] == ['weighted', 'avg']:
                return [float(parts[2]), float(parts[3]), float(parts[4])]
        raise ValueError("no 'weighted avg' line found in classification report")

    val1 = _weighted_avg(cr1)
    val2 = _weighted_avg(cr2)
    metrics = ["precision", "recall", "f1-score"]
    header_line = "{:<20} {:>12} {:>12} {:>12}".format("", "Model 1", "Model 2", "Difference")
    print(header_line)
    print("=" * len(header_line))
    for name, v1, v2 in zip(metrics, val1, val2):
        print("{:<20} {:>12} {:>12} {:>12}".format(name, str(v1), str(v2), str(v1 - v2)))
# Pairwise comparison of the weighted-average scores on the Alcohol target:
# RF vs SVC, RF vs KNN, KNN vs SVC.
compare_reports(class_rep_rff['Alcohol'],class_rep_svc['Alcohol'])
# (notebook output:)
Model 1 Model 2 Difference =========================================================== precision 0.94 0.92 0.019999999999999907 recall 0.93 0.94 -0.009999999999999898 f1-score 0.92 0.93 -0.010000000000000009
compare_reports(class_rep_rff['Alcohol'],class_rep_knn['Alcohol'])
# (notebook output:)
Model 1 Model 2 Difference =========================================================== precision 0.94 0.58 0.36 recall 0.93 0.6 0.33000000000000007 f1-score 0.92 0.58 0.3400000000000001
compare_reports(class_rep_knn['Alcohol'],class_rep_svc['Alcohol'])
# (notebook output:)
Model 1 Model 2 Difference =========================================================== precision 0.58 0.92 -0.3400000000000001 recall 0.6 0.94 -0.33999999999999997 f1-score 0.58 0.93 -0.3500000000000001
# Hyper-parameter search grids for the three model families, used by
# prediction_grid_search below.
knn_params = {'n_neighbors':[3, 5, 7], 'weights':['uniform', 'distance']}
svc_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
rf_params = {'n_estimators': [50, 100, 150],'max_depth': [None, 10, 20],'min_samples_split': [2, 5, 10],'min_samples_leaf': [1, 2, 4]}
def create_classifier(modelo, **params):
    """Instantiate the classifier class `modelo` with the given keyword parameters."""
    instance = modelo(**params)
    return instance
def prediction_grid_search(modelo, drug, params):
    """Grid-search `modelo` over `params` for the target `drug`.

    Builds X from the feature list selected for the drug (index of
    dfs[f'df_{drug}'] applied to the global df2), runs a 5-fold accuracy
    grid search on an 80% training split, and returns
    [test-set accuracy of the best model, best parameter dict].
    """
    X = df2[dfs[f'df_{drug}'].index]
    Y = df2[drug]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=37)
    grid_search = GridSearchCV(create_classifier(modelo), params, cv=5, scoring='accuracy')
    grid_search.fit(X_train, Y_train)
    best_params = grid_search.best_params_
    # GridSearchCV(refit=True, the default) has already refit the best
    # configuration on the whole training split, so reuse best_estimator_
    # instead of fitting a fresh copy. (Also removes the unused
    # `model = modelo()` local the original created and never used.)
    Y_pred = grid_search.best_estimator_.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    return [accuracy, best_params]
# Results-table skeleton: one row per drug; the accuracy / best-params
# cells start at 0 and are filled by the grid-search loop below.
bring = {'Drug': columns_to_categorize}
for result_col in ('RFF', 'Best_params_rff', 'KNN', 'Best_params_knn', 'SVC', 'Best_params_svc'):
    bring[result_col] = [0] * len(columns_to_categorize)
final_df = pd.DataFrame(data=bring)
final_df
| Drug | RFF | Best_params_rff | KNN | Best_params_knn | SVC | Best_params_svc | |
|---|---|---|---|---|---|---|---|
| 0 | Alcohol | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | Amphetamines | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | Amyl Nitrite | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | Benzodiazepine | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | Caffeine | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | Canabis | 0 | 0 | 0 | 0 | 0 | 0 |
| 6 | Chocolate | 0 | 0 | 0 | 0 | 0 | 0 |
| 7 | Cocaine | 0 | 0 | 0 | 0 | 0 | 0 |
| 8 | Crack | 0 | 0 | 0 | 0 | 0 | 0 |
| 9 | Ecstasy | 0 | 0 | 0 | 0 | 0 | 0 |
| 10 | Heroin | 0 | 0 | 0 | 0 | 0 | 0 |
| 11 | Ketamine | 0 | 0 | 0 | 0 | 0 | 0 |
| 12 | Legal Highs | 0 | 0 | 0 | 0 | 0 | 0 |
| 13 | LSD | 0 | 0 | 0 | 0 | 0 | 0 |
| 14 | Methadone | 0 | 0 | 0 | 0 | 0 | 0 |
| 15 | Mushrooms | 0 | 0 | 0 | 0 | 0 | 0 |
| 16 | Nicotine | 0 | 0 | 0 | 0 | 0 | 0 |
| 17 | Semer | 0 | 0 | 0 | 0 | 0 | 0 |
| 18 | VSA | 0 | 0 | 0 | 0 | 0 | 0 |
# Run the grid search for every drug and each model family, storing the
# returned [accuracy, best_params] pair into the matching row/columns,
# then persist the full results table.
for drug in columns_to_categorize:
    final_df.loc[final_df['Drug']==drug,['RFF','Best_params_rff']]=prediction_grid_search(rff,drug,rf_params)
    final_df.loc[final_df['Drug']==drug,['KNN','Best_params_knn']]=prediction_grid_search(knn,drug,knn_params)
    final_df.loc[final_df['Drug']==drug,['SVC','Best_params_svc']]=prediction_grid_search(svc,drug,svc_params)
final_df.to_csv('final_df.csv', index=False)
We will create a machine learning model based on the RandomForest model, which predicts the probability of a person using drugs at least once. The aim is to predict whether a person has already taken certain drugs using only visible factors.
Data Preparation
We define columns for demographic data, psychological factors and the common drug used.
# Demographic + personality feature columns, and the drug target columns.
df_standard = df_standard.astype('str')
base_columns = ['Age','Sex', 'Education', 'Country', 'Ethnicity', 'Neuroticism', 'Extraversion',
                'Openness', 'Agreeableness', 'Conscientiousness']
drugs = ['Alcohol', 'Amphetamines', 'Amyl Nitrite', 'Benzodiazepine', 'Caffeine', 'Canabis',
         'Chocolate', 'Cocaine', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legal Highs',
         'LSD', 'Methadone', 'Mushrooms', 'Nicotine', 'Semer', 'VSA']
# .copy() makes model_data an explicitly independent frame, so the in-place
# encoding below cannot trigger SettingWithCopyWarning or write back into
# df_standard (which is still needed unencoded for the example profile).
model_data = df_standard[base_columns + drugs].copy()
Encoding and Model Training
We encode our categorical data and train a RandomForestClassifier for each drug. Drug columns are encoded as binary (never used vs. used at least once), while the other columns are encoded with a LabelEncoder.
# Encoding
# Drug columns are binarized first (0 = 'Never Used', 1 = used at least
# once) and then label-encoded; the demographic/personality columns are
# label-encoded directly. One fitted encoder is kept per column so the
# prediction function can transform new inputs consistently.
label_encoders = {}
for col in base_columns + drugs:
    encoder = LabelEncoder()
    if col in drugs:
        values = model_data[col].apply(lambda x: 0 if x == 'Never Used' else 1)
    else:
        values = model_data[col]
    label_encoders[col] = encoder
    model_data[col] = encoder.fit_transform(values)
# Train/test split
# Train one RandomForest per drug target. The feature matrix (demographics
# + personality) is identical for every target, so it is built once outside
# the loop instead of being recomputed per drug as in the original.
models = {}
X = model_data.drop(drugs, axis=1)
for drug in drugs:
    y = model_data[drug]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    models[drug] = model
Prediction Function
We define a function to predict the probability of a person using drugs at least once, based on demographic and psychological factors.
def predict_drug_use_probability(age, sex, education, country, ethnicity, neuroticism, extraversion, openness, agreeableness, conscientiousness):
    """Predict, for each drug, the probability this profile has used it at least once.

    Each argument is the raw (string) category for one demographic or
    psychological feature. Values are encoded with the LabelEncoders fitted
    during training, then scored by every per-drug RandomForest.

    Returns a dict mapping drug name -> probability of the "used" class.
    Raises ValueError if a value was never seen by the matching encoder.
    """
    # Data-driven encoding replaces ten copy-pasted transform lines;
    # order matches the function's positional parameters.
    feature_names = ['Age', 'Sex', 'Education', 'Country', 'Ethnicity', 'Neuroticism',
                     'Extraversion', 'Openness', 'Agreeableness', 'Conscientiousness']
    raw_values = [age, sex, education, country, ethnicity, neuroticism,
                  extraversion, openness, agreeableness, conscientiousness]
    encoded_input = {
        name: label_encoders[name].transform([value])[0]
        for name, value in zip(feature_names, raw_values)
    }
    input_df = pd.DataFrame([encoded_input])
    # predict_proba returns [[P(class 0), P(class 1)]]; keep P(used).
    probabilities = {}
    for drug, model in models.items():
        probabilities[drug] = model.predict_proba(input_df)[0][1]
    return probabilities
Example of use
# Example input: row 0's visible (non-drug) features.
profile =df_standard[base_columns].iloc[0]
print(f"Profile:\n {profile}\n")
# Unpack the ten feature values as positional arguments.
example_probabilities = predict_drug_use_probability(*profile)
Profile: Age 35-44 Sex Female Education Professional certificate/ diploma Country UK Ethnicity Mixed-White/Asian Neuroticism Average Neuroticism Extraversion Average Extraversion Openness Average Openness Agreeableness Average Agreeableness Conscientiousness Average Conscientiousness Name: 1, dtype: object
Here, we have the probability of having taken at least once these drugs for this profile
# Display each drug's predicted probability as a percentage.
print(f"Probability of having taken at least once \n")
for drug, probability in example_probabilities.items():
print(f"{drug}: {probability * 100:.2f}%")
Probability of having taken at least once Alcohol: 99.50% Amphetamines: 87.57% Amyl Nitrite: 8.16% Benzodiazepine: 81.85% Caffeine: 100.00% Canabis: 16.32% Chocolate: 98.83% Cocaine: 4.00% Crack: 0.00% Ecstasy: 2.36% Heroin: 1.00% Ketamine: 1.53% Legal Highs: 2.95% LSD: 1.83% Methadone: 0.00% Mushrooms: 2.30% Nicotine: 86.50% Semer: 0.00% VSA: 4.25%
This prediction makes perfect sense, especially when it comes to alcohol, caffeine and chocolate, which are common drugs!
Compare the prediction of the profile with its actual data
We will use a binary approach, assuming that any probability above 50% means the profile has already used the drug.
def compare_predictions_with_actual(profile, example_probabilities):
    """Compare per-drug predictions against a profile's recorded usage.

    A drug counts as predicted-used when its probability is >= 0.5, and as
    actually-used when the profile's value differs from 'Never Used'.
    Returns (comparison dict keyed by drug, number of correct predictions).
    """
    comparison = {}
    correct_predictions_count = 0
    for drug, probability in example_probabilities.items():
        actual_use = profile[drug] != 'Never Used'
        predicted_use = probability >= 0.5
        is_correct = actual_use == predicted_use
        correct_predictions_count += int(is_correct)
        comparison[drug] = {
            'Actual Use': actual_use,
            'Predicted Use': predicted_use,
            'Correct Prediction': is_correct,
        }
    return comparison, correct_predictions_count
def calculate_precision(correct_predictions_count, total_drugs):
    """Return the share of correct predictions as a percentage (0 if no drugs)."""
    return (correct_predictions_count / total_drugs) * 100 if total_drugs > 0 else 0
# Full row 0 (including the drug columns) is the ground truth; row 0 is the
# same profile the example predictions above were made for.
profile_with_drugs = df_standard.iloc[0]
comparison_results, correct_predictions_count = compare_predictions_with_actual(profile_with_drugs, example_probabilities)
for result in comparison_results:
print(f"{result}: {comparison_results[result]}")
# NOTE(review): this row was part of the training data, so a perfect score
# here is expected rather than a measure of generalization.
precision = calculate_precision(correct_predictions_count, len(drugs))
print(f"Precision: {precision:.2f}%")
Alcohol: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Amphetamines: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Amyl Nitrite: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Benzodiazepine: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Caffeine: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Canabis: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Chocolate: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Cocaine: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Crack: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Ecstasy: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Heroin: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Ketamine: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Legal Highs: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
LSD: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Methadone: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Mushrooms: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Nicotine: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Semer: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
VSA: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Precision: 100.00%
We see a precision of 100% for this profile — an encouraging result, though this row comes from the data the models were trained on, so it is an optimistic estimate.
Compare the prediction of the profile with its actual data
We will use a binary approach, assuming that any probability above 50% means the profile already uses the drug.
def convert_probabilities_to_predictions(probabilities, threshold=0.5):
    """Binarize probabilities: 1 when prob >= threshold, otherwise 0."""
    return [int(prob >= threshold) for prob in probabilities]
# NOTE(review): despite the X_test name, this scores every row of model_data,
# 80% of which each model was trained on (test_size=0.2 above) — so these
# precision figures are optimistic, largely in-sample estimates.
X_test = model_data.drop(drugs, axis=1)
y_test_actuals = {}
y_test_predictions = {}
precisions = {}
for drug in drugs:
model = models[drug]
y_test_actuals[drug] = model_data[drug]
# P(used) for every row, binarized at the default 0.5 threshold.
probabilities = model.predict_proba(X_test)[:, 1]
predictions = convert_probabilities_to_predictions(probabilities)
y_test_predictions[drug] = predictions
precision = precision_score(y_test_actuals[drug], predictions)
precisions[drug] = precision
for drug, precision in precisions.items():
print(f"Precision for {drug}: {precision * 100:.2f}%")
average_precision = sum(precisions.values()) / len(precisions)
print(f"Average Precision: {average_precision * 100:.2f}%")
Precision for Alcohol: 99.46% Precision for Amphetamines: 86.76% Precision for Amyl Nitrite: 84.77% Precision for Benzodiazepine: 88.10% Precision for Caffeine: 99.62% Precision for Canabis: 93.40% Precision for Chocolate: 99.52% Precision for Cocaine: 84.54% Precision for Crack: 90.24% Precision for Ecstasy: 87.12% Precision for Heroin: 92.34% Precision for Ketamine: 86.88% Precision for Legal Highs: 90.34% Precision for LSD: 88.35% Precision for Methadone: 83.49% Precision for Mushrooms: 87.69% Precision for Nicotine: 90.86% Precision for Semer: 100.00% Precision for VSA: 84.85% Average Precision: 90.44%
We will create a machine learning model based on the RandomForest model, which predicts the probability of a person using drugs at least once, based on demographic and psychological factors and a few common drugs.
Data Preparation
We add the common drugs.
# Cast every column to string so the LabelEncoders below see a uniform dtype.
df_standard = df_standard.astype('str')
# Features now also include the four common drugs (alcohol, caffeine,
# nicotine, chocolate) alongside demographics and personality scores.
base_columns = ['Age', 'Sex', 'Education', 'Country', 'Ethnicity', 'Neuroticism', 'Extraversion',
'Openness', 'Agreeableness', 'Conscientiousness', 'Alcohol', 'Caffeine', 'Nicotine', 'Chocolate']
# Remaining drugs are the prediction targets.
drugs = ['Amphetamines', 'Amyl Nitrite', 'Benzodiazepine', 'Canabis',
'Cocaine', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legal Highs',
'LSD', 'Methadone', 'Mushrooms', 'Semer', 'VSA']
# .copy() so the in-place encoding of model_data cannot trigger
# SettingWithCopyWarning or silently write through to df_standard.
model_data = df_standard[base_columns + drugs].copy()
Encoding and Model Training
Drug columns are encoded as binary (ever used vs. never used), while the remaining columns are encoded with LabelEncoder.
# Fit one LabelEncoder per column. Target (drug) columns are first collapsed
# to binary ever-used flags; feature columns are label-encoded as-is.
label_encoders = {}
for col in base_columns + drugs:
    encoder = LabelEncoder()
    column = model_data[col]
    if col in drugs:
        # 'Never Used' -> '0', any usage-recency category -> '1'.
        column = column.apply(lambda x: '0' if x == 'Never Used' else '1')
    model_data[col] = encoder.fit_transform(column)
    label_encoders[col] = encoder
# One RandomForest per target drug. The feature matrix (demographics,
# personality, and the four common drugs) is the same for every target,
# so compute it once instead of on each iteration.
X = model_data.drop(drugs, axis=1)
models = {}
for drug in drugs:
    y = model_data[drug]
    # Fixed seed keeps the split reproducible; only the train part is used here.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    models[drug] = model
Prediction Function
We define a function to predict the probability of a person using drugs at least once, based on demographic and psychological factors, but also on common drugs such as alcohol, chocolate, caffeine and nicotine.
def predict_drug_use_probability(age, sex, education, country, ethnicity, neuroticism, extraversion, openness, agreeableness, conscientiousness, alcohol, caffeine, nicotine, chocolate):
    """Predict, for each drug, the probability this profile has used it at least once.

    Takes the raw (string) category for each demographic/psychological
    feature plus the four common drugs, encodes them with the fitted
    LabelEncoders, and scores the result with every per-drug RandomForest.

    Returns a dict mapping drug name -> probability of the "used" class.
    Raises ValueError if a value was never seen by the matching encoder.
    """
    # Data-driven encoding replaces fourteen copy-pasted transform lines;
    # order matches the function's positional parameters.
    feature_names = ['Age', 'Sex', 'Education', 'Country', 'Ethnicity', 'Neuroticism',
                     'Extraversion', 'Openness', 'Agreeableness', 'Conscientiousness',
                     'Alcohol', 'Caffeine', 'Nicotine', 'Chocolate']
    raw_values = [age, sex, education, country, ethnicity, neuroticism, extraversion,
                  openness, agreeableness, conscientiousness, alcohol, caffeine,
                  nicotine, chocolate]
    encoded_input = {
        name: label_encoders[name].transform([value])[0]
        for name, value in zip(feature_names, raw_values)
    }
    input_df = pd.DataFrame([encoded_input])
    # predict_proba returns [[P(class 0), P(class 1)]]; keep P(used).
    probabilities = {}
    for drug, model in models.items():
        probabilities[drug] = model.predict_proba(input_df)[0][1]
    return probabilities
Example of use :
# Example input: row 1's visible features (demographics, personality,
# and the four common drugs).
profile =df_standard[base_columns].iloc[1]
print(f"Profile:\n {profile}\n")
Profile: Age 25-34 Sex Male Education Doctorate degree Country UK Ethnicity White Neuroticism Average Neuroticism Extraversion High Extraversion Openness High Openness Agreeableness Average Agreeableness Conscientiousness Average Conscientiousness Alcohol Used in Last Week Caffeine Used in Last Day Nicotine Used in Last Month Chocolate Used in Last Day Name: 2, dtype: object
# Predict for row 1 by unpacking the fourteen feature values positionally.
example_probabilities = predict_drug_use_probability(*profile)
print(f"Probability of having take at least one time \n")
for drug, probability in example_probabilities.items():
print(f"{drug}: {probability * 100:.2f}%")
Probability of having take at least one time Amphetamines: 85.00% Amyl Nitrite: 86.00% Benzodiazepine: 19.00% Canabis: 99.00% Cocaine: 81.00% Crack: 8.00% Ecstasy: 87.00% Heroin: 1.00% Ketamine: 82.00% Legal Highs: 11.00% LSD: 82.00% Methadone: 73.00% Mushrooms: 26.00% Semer: 0.00% VSA: 7.00%
Let's evaluate the precision of the model with the same profile, for each drug.
We will estimate that if the probability of this profile to have already used a drug is more than 50 percent, then he has used it once before.
def compare_predictions_with_actual(profile, example_probabilities):
    """Compare per-drug predictions against a profile's recorded usage.

    A drug counts as predicted-used when its probability is >= 0.5, and as
    actually-used when the profile's value differs from 'Never Used'.
    Returns (comparison dict keyed by drug, number of correct predictions).
    """
    comparison = {}
    for drug in example_probabilities:
        used_before = profile[drug] != 'Never Used'
        flagged = example_probabilities[drug] >= 0.5
        comparison[drug] = {
            'Actual Use': used_before,
            'Predicted Use': flagged,
            'Correct Prediction': used_before == flagged,
        }
    correct_predictions_count = sum(
        1 for entry in comparison.values() if entry['Correct Prediction']
    )
    return comparison, correct_predictions_count
def calculate_precision(correct_predictions_count, total_drugs):
    """Return the share of correct predictions as a percentage (0 if no drugs)."""
    if total_drugs <= 0:
        return 0
    return (correct_predictions_count / total_drugs) * 100
# Ground-truth row for the comparison. The example predictions above were
# made for df_standard.iloc[1], so compare against that same row; the
# original used iloc[0] — a different person's actual data — which is why
# the measured precision collapsed to 46.67%.
profile_with_drugs = df_standard.iloc[1]
comparison_results, correct_predictions_count = compare_predictions_with_actual(profile_with_drugs, example_probabilities)
for result in comparison_results:
    print(f"{result}: {comparison_results[result]}")
precision = calculate_precision(correct_predictions_count, len(drugs))
print(f"Precision: {precision:.2f}%")
Amphetamines: {'Actual Use': True, 'Predicted Use': True, 'Correct Prediction': True}
Amyl Nitrite: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Benzodiazepine: {'Actual Use': True, 'Predicted Use': False, 'Correct Prediction': False}
Canabis: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Cocaine: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Crack: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Ecstasy: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Heroin: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Ketamine: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Legal Highs: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
LSD: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Methadone: {'Actual Use': False, 'Predicted Use': True, 'Correct Prediction': False}
Mushrooms: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Semer: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
VSA: {'Actual Use': False, 'Predicted Use': False, 'Correct Prediction': True}
Precision: 46.67%
The precision here is only 46.67%, not 100% — note that the comparison used the actual data of row 0 (`iloc[0]`) while the predictions were made for the profile in row 1 (`iloc[1]`).
Now let's try to evaluate the model more globally
# Cross-validated evaluation of a fresh RandomForest per drug.
# The feature matrix is loop-invariant, so build it once outside the loop.
# NOTE(review): each metric runs its own 5-fold CV, so every forest is
# fitted four times per drug; sklearn.model_selection.cross_validate could
# compute all four metrics in a single CV pass.
X = model_data.drop(drugs, axis=1)
cv_scores = {}
for drug in drugs:
    y = model_data[drug]
    model = RandomForestClassifier(random_state=42)
    accuracy = cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()
    precision = cross_val_score(model, X, y, cv=5, scoring='precision').mean()
    recall = cross_val_score(model, X, y, cv=5, scoring='recall').mean()
    f1 = cross_val_score(model, X, y, cv=5, scoring='f1').mean()
    cv_scores[drug] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
for drug, scores in cv_scores.items():
    print(f"{drug}: {scores}")
Amphetamines: {'Accuracy': 0.6726790450928382, 'Precision': 0.6660439373265128, 'Recall': 0.670001821383037, 'F1 Score': 0.657977597793498}
Amyl Nitrite: {'Accuracy': 0.7177718832891247, 'Precision': 0.5625714129064093, 'Recall': 0.3844827586206897, 'F1 Score': 0.44940824855895656}
Benzodiazepine: {'Accuracy': 0.6668435013262599, 'Precision': 0.6601218258640413, 'Recall': 0.6033898305084746, 'F1 Score': 0.6214519345673828}
Canabis: {'Accuracy': 0.8482758620689654, 'Precision': 0.8836849516181594, 'Recall': 0.9279534186555978, 'F1 Score': 0.9044743352317953}
Cocaine: {'Accuracy': 0.683289124668435, 'Precision': 0.6643628950577647, 'Recall': 0.6326627218934912, 'F1 Score': 0.6352663678838143}
Crack: {'Accuracy': 0.8567639257294429, 'Precision': 0.41208791208791207, 'Recall': 0.08521870286576169, 'F1 Score': 0.136518615032766}
Ecstasy: {'Accuracy': 0.6965517241379311, 'Precision': 0.6865948215323375, 'Recall': 0.6755948380158623, 'F1 Score': 0.6621820411597504}
Heroin: {'Accuracy': 0.8461538461538461, 'Precision': 0.5454220779220779, 'Recall': 0.10357142857142856, 'F1 Score': 0.16262739639189236}
Ketamine: {'Accuracy': 0.7777188328912465, 'Precision': 0.5032612270714738, 'Recall': 0.13924050632911394, 'F1 Score': 0.20133835785026816}
Legal Highs: {'Accuracy': 0.7517241379310344, 'Precision': 0.7352082346594845, 'Recall': 0.6788153809410079, 'F1 Score': 0.6837768343386229}
LSD: {'Accuracy': 0.7172413793103448, 'Precision': 0.7142869927473644, 'Recall': 0.6616339967080652, 'F1 Score': 0.6524936903810927}
Methadone: {'Accuracy': 0.7809018567639257, 'Precision': 0.6657684759871472, 'Recall': 0.3770664118490205, 'F1 Score': 0.4276268756824034}
Mushrooms: {'Accuracy': 0.7013262599469496, 'Precision': 0.7011485864773364, 'Recall': 0.6995089011663597, 'F1 Score': 0.6783927227060144}
Semer: {'Accuracy': 0.9957559681697612, 'Precision': 0.0, 'Recall': 0.0, 'F1 Score': 0.0}
VSA: {'Accuracy': 0.7575596816976129, 'Precision': 0.5785185185185185, 'Recall': 0.15348837209302327, 'F1 Score': 0.21560516198253513}
We can also draw a ROC curve for one drug to see the accuracy of the prediction on this drug :
# ROC curve for the Amphetamines target.
X = model_data.drop(drugs, axis=1)
y = model_data['Amphetamines']
model = RandomForestClassifier(random_state=42)
# Out-of-fold P(class 1) from 5-fold CV: each row is scored by a model
# that did not train on it, unlike the in-sample precision figures above.
y_scores = cross_val_predict(model, X, y, cv=5, method='predict_proba')[:, 1]
fpr, tpr, thresholds = roc_curve(y, y_scores)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal = random-classifier baseline.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
We can see that the model is pretty accurate
Now let's evaluate if using binary predictions after the other model improves the prediction :
def convert_probabilities_to_predictions(probabilities, threshold=0.5):
    """Binarize probabilities: 1 when prob >= threshold, otherwise 0."""
    predictions = []
    for prob in probabilities:
        predictions.append(1 if prob >= threshold else 0)
    return predictions
# NOTE(review): despite the X_test name, this scores every row of model_data,
# 80% of which each model was trained on (test_size=0.2 above) — so these
# precision figures are optimistic, largely in-sample estimates.
X_test = model_data.drop(drugs, axis=1)
y_test_actuals = {}
y_test_predictions = {}
precisions = {}
for drug in drugs:
model = models[drug]
y_test_actuals[drug] = model_data[drug]
# P(used) for every row, binarized at the default 0.5 threshold.
probabilities = model.predict_proba(X_test)[:, 1]
predictions = convert_probabilities_to_predictions(probabilities)
y_test_predictions[drug] = predictions
precision = precision_score(y_test_actuals[drug], predictions)
precisions[drug] = precision
for drug, precision in precisions.items():
print(f"Precision for {drug}: {precision * 100:.2f}%")
average_precision = sum(precisions.values()) / len(precisions)
print(f"Average Precision: {average_precision * 100:.2f}%")
Precision for Amphetamines: 92.70% Precision for Amyl Nitrite: 94.24% Precision for Benzodiazepine: 93.87% Precision for Canabis: 97.00% Precision for Cocaine: 92.40% Precision for Crack: 95.59% Precision for Ecstasy: 92.11% Precision for Heroin: 96.30% Precision for Ketamine: 94.63% Precision for Legal Highs: 94.17% Precision for LSD: 92.87% Precision for Methadone: 92.94% Precision for Mushrooms: 93.44% Precision for Semer: 100.00% Precision for VSA: 94.81% Average Precision: 94.47%
To conclude, the model's efficiency notably improves by 5% when it incorporates common drugs into its prediction profile, marking a substantial enhancement. It adeptly combines psychological, demographic, and common drug usage data to accurately predict drug exposure risks. This approach is vital for pinpointing individuals at risk, particularly when such risk isn't immediately apparent. Demonstrating its practical use, the model effectively identifies potential drug users by analyzing a range of influencing factors.
# Per-country mean of each drug-usage column.
avg_drug_usage_per_country = df4.groupby('Country')[columns_to_categorize].mean()

def country_plot_drug_usage(country):
    """Bar-chart the average usage level of every drug for one country."""
    plt.figure(figsize=(27, 20))
    country_means = avg_drug_usage_per_country.loc[country]
    plt.bar(avg_drug_usage_per_country.columns, country_means)
    plt.title(f"Drug usage in {country}")
    plt.xlabel("Drugs")
    plt.ylabel("Mean Percentage")
    plt.show()

# One chart per country present in the data.
for country in df4['Country'].unique():
    country_plot_drug_usage(country)
def impact_feature(feature):
    """For every drug column, show a grouped histogram of consumption broken down by `feature`."""
    # Full drug list is intentionally hard-coded: the module-level `drugs`
    # list no longer contains the common drugs at this point.
    drug_columns = ['Alcohol','Amphetamines','Amyl Nitrite','Benzodiazepine','Caffeine','Canabis','Chocolate','Cocaine','Crack','Ecstasy','Heroin','Ketamine','Legal Highs','LSD','Methadone','Mushrooms','Nicotine','Semer','VSA']
    # enumerate() index was unused in the original; a plain loop suffices.
    for col in drug_columns:
        graph = px.histogram(df4, x=col, color=feature, barmode='group', title=f'Impact of the {feature} on {col} consumption')
        graph.show()
impact_feature('Country')
# Work on a deep copy so the extra "...WithCount" columns stay out of df3.
df1 = df3.copy(deep=True)

def survey_percentage(feature):
    """Pie-chart the share of respondents per category of `feature`, labels annotated with counts."""
    category_counts = df1[feature].value_counts()
    label_col = f'{feature}WithCount'
    df1[label_col] = df1[feature].apply(lambda x: f"{x} ({category_counts[x]})")
    fig = px.pie(df1, names=label_col, color=label_col, title=f"Percentage of people surveyed by {feature}")
    fig.show()

Things=['Age','Sex','Education','Country','Ethnicity','Neuroticism','Extraversion','Openness','Agreeableness','Conscientiousness','Impulsivness','Sensation Seeing']
for col in Things:
    survey_percentage(col)
def box_relation(feat1, feat2):
    """Box-plot the distribution of `feat2` across the categories of `feat1` (raw scores in df2)."""
    sns.boxplot(data=df2, x=feat1, y=feat2)
    plt.xticks(rotation=45)
    plt.show()

box_relation("Sex","Impulsivness") #Men are more impulsive than women